clear
set more off
cap log close

***************************************************************************************************
* Program: prepare_analysis_data.do
***************************************************************************************************

************************************************************************
** set macros
************************************************************************
global raw_data "${prison_dir}/raw_data"
global intermediate_data "${prison_dir}/intermediate_data"
global code "${prison_dir}/code"
global analysis_data "${prison_dir}/analysis_data"
global output "${prison_dir}/output"

************************************************************************
** PART A. PREPROCESS DATA
************************************************************************
******************
* 1A. Prepare Zipcode-ZCTA crosswalk in CA 
******************

import excel "${raw_data}/ZIPCodetoZCTACrosswalk2021UDS.xlsx", sheet("Zip to ZCTA") firstrow clear

keep if STATE == "CA"

*tab zip_join_type
*assert ZIP_CODE == ZCTA if zip_join_type == "Zip matches ZCTA"

sort ZCTA ZIP_CODE
destring ZCTA ZIP_CODE, replace 

save "${intermediate_data}/CA_ZIPCodetoZCTACrosswalk2021UDS.dta", replace 

******************
* 1B. Create a unique list of ZCTAs (to be merge later)
******************
*use "${intermediate_data}/CA_ZCTA_cases_p100k_bymonth.dta", clear

use "${intermediate_data}/CA_ZIPCodetoZCTACrosswalk2021UDS.dta", clear 

gunique ZIP_CODE, generate(num_zip) by(ZCTA)

sort ZCTA
drop if ZCTA == ZCTA[_n-1]

keep ZCTA num_zip

save "${intermediate_data}/CA_ZCTA_uniq_list.dta", replace 


******************
* 2. Clean ZCTA demographics data (ACS 2017-2021)
******************

import delimited "${raw_data}/nhgis0022_ds254_20215_zcta.csv", clear

assert aon4e001 == aooce001

* calculate racial share 
gen pct_nh_white_acs_17_21 = aooce003 / aooce001 
gen pct_nh_black_acs_17_21 = aooce004 / aooce001 
gen pct_hisp_acs_17_21 = aooce012 / aooce001 
gen pct_nh_asian_acs_17_21 = aooce006 / aooce001 

ren aooce001 tot_population_acs_17_21 

keep tl_geo_id name_e *acs_17_21

ren tl_geo_id ZCTA

save "${intermediate_data}/ZCTA_pop_race_acs_17_21.dta", replace 
 

******************
* 3A. Create Clean zip code covid data
******************

import excel "${raw_data}/Zipcode_PRA_18185-032923.xlsx", sheet("CA Cases Zip") cellrange(A2:N1762) firstrow clear

* rename variables 
rename A zipcode
rename B cases_p100k1
rename C cases_p100k2 
rename D cases_p100k3 
rename E cases_p100k4 
rename F cases_p100k5
rename G cases_p100k6
rename H cases_p100k7 
rename I cases_p100k8 
rename J cases_p100k9 
rename K cases_p100k10 
rename L cases_p100k11 
rename M cases_p100k12 
drop N // all are missing 
drop if missing(zipcode) // several lines are missing 

* reshape data from wide to long 
reshape long cases_p100k, i(zipcode) j(month)

* merge to get the ZCTA variable 
ren zipcode ZIP_CODE
merge m:1 ZIP_CODE using "${intermediate_data}/CA_ZIPCodetoZCTACrosswalk2021UDS.dta", ///
	keep(master match) nogenerate 
ren ZIP_CODE zipcode 

drop STATE

save "${intermediate_data}/CA_zipcode_cases_p100k_bymonth.dta", replace 

******************
* 3B. Create ZCTA-level zip code covid data
******************
use "${intermediate_data}/CA_zipcode_cases_p100k_bymonth.dta", clear

* dummy: ZCTA == Zip code 
gen byte ZCTA_zip_same = zipcode == ZCTA
egen tag_zip = tag(zipcode)
tab ZCTA_zip_same if tag_zip 

egen tag_ZCTA = tag(ZCTA)

gunique zipcode, by(ZCTA) generate(num_zip)

tab num_zip if tag_ZCTA

* calculate max./min. cases in ZCTA (if ZCTA includes multiple zip code)
* Note: I calculaated max/min cases because I only has cases per 100k, 
* rather than cases/tag_ZCTApopulation in each zip code 
gegen max_cases_p100k = max(cases_p100k), by(ZCTA month)
gegen min_cases_p100k = min(cases_p100k), by(ZCTA month)

drop cases_p100k ZCTA_zip_same tag_ZCTA tag_zip 

sort ZCTA month
drop if ZCTA == ZCTA[_n-1] & month == month[_n-1]

save "${intermediate_data}/CA_ZCTA_cases_p100k_bymonth.dta", replace 


*************************************************************************************************************************************
* 4. Clean CA facility population data
* downloaded from: https://github.com/nrjones8/cdcr-population-data
*************************************************************************************************************************************

* 2019-12
import delimited "${raw_data}/monthly_cdcr_population.csv", clear

* get baseline prison population (in 2019/12)
keep if year == 2019 & month == 12

* there are two prisons that count female and male felons separately
* e.g. institution_name: FOL (FOLSOM SP) (FEMALE); FOL (FOLSOM SP) (MALE)
* create prison acronym variables 
split institution_name,  parse("(") generate(name)
replace name1 = trim(name1)
rename name1 acronym
drop name2
sort acronym
* drop unnecessary variables 
drop source_pdf_name civil_addict percent_occupied
drop if institution_name == "SQ (SAN QUENTIN SP) (FEMALE)" // only has 1 person 
* count the total population by the prison name acronym
foreach var of varlist population_felons total_population designed_capacity staffed_capacity{
 egen `var'_sum = total(`var'), by(acronym)
}

* collapse data to prison level
sort acronym
drop if acronym == acronym[_n-1]

* replace institution_name
replace institution_name = "FOL (FOLSOM SP) (FEMALE + MALE)" if inlist(institution_name, "FOL (FOLSOM SP) (FEMALE)", "FOL (FOLSOM SP) (MALE)")
keep institution_name acronym population_felons_sum total_population_sum staffed_capacity_sum designed_capacity_sum

* rename population variables 
rename *_sum *_201912

save "${intermediate_data}/monthly_cdcr_population_2019_12.dta", replace
 
 
*************************************************************************************************************************************
* 5. Merge CDCR population w/ Shapefile ID
*************************************************************************************************************************************

use "${raw_data}/Prison_Boundaries.dta", clear

drop if ANALYTICSAMPLE >=3 & ANALYTICSAMPLE !=. 

split NAME,  parse("(") generate(acronym) 

drop acronym1 
replace acronym2 = subinstr(acronym2, ")", "", .)

replace acronym2 = "ASP" if NAME == "AVENAL STATE PRISON"
replace acronym2 = "FOL" if acronym2 == "FSP"
replace acronym2 = "CCWF" if acronym2 == "CCFW"

replace acronym2 = NAME if missing(acronym2)

rename acronym2 acronym

merge 1:1 acronym using "${intermediate_data}/monthly_cdcr_population_2019_12.dta"

keep FID FACILITYID ANALYTICSAMPLE NAME ADDRESS ZIP POPULATION COUNTY COUNTYFIPS _ID acronym institution_name population_felons_201912 designed_capacity_201912 staffed_capacity_201912

* manually enter population for CAC 
* population counts from: 
* https://www.cdcr.ca.gov/research/wp-content/uploads/sites/174/2020/12/2019-Q4-CAC-SB601.pdf
* rule 12 - Total Number of Inmates - Dec-19: 2179
replace population_felons_201912 = 2179 if NAME == "CALIFORNIA CITY CORRECTIONAL CENTER (CAC)"
replace institution_name = "CAC" if NAME == "CALIFORNIA CITY CORRECTIONAL CENTER (CAC)"

* crosswalk: ZIP to ZCTA
* merge to get the ZCTA variable 
ren ZIP ZIP_CODE
merge m:1 ZIP_CODE using "${intermediate_data}/CA_ZIPCodetoZCTACrosswalk2021UDS.dta", ///
	keep(master match) nogenerate 
ren ZIP_CODE prison_zip
ren ZCTA prison_ZCTA 

save "${intermediate_data}/cdcr_population_w_ShapefileID.dta", replace
	
*************************************************************************************************************************************
* 6a. Calculate prison covid cases 
*************************************************************************************************************************************

import delimited "${raw_data}/CDCR_covid_19.csv", clear

* rename name to match 
gen NAME = upper(institutionname)
replace NAME = "AVENAL STATE PRISON" if NAME == "AVENAL STATE PRISON (ASP)"
replace NAME = "CALIFORNIA CITY CORRECTIONAL CENTER (CAC)" if NAME == "CALIFORNIA CITY CORRECTIONAL FACILITY (CAC)"
replace NAME = "CENTINELA STATE PRISON (CEN)" if NAME == "CALIFORNIA STATE PRISON-CENTINELA (CEN)"
replace NAME = "CALIFORNIA STATE PRISON, CORCORAN (COR)" if NAME == "CALIFORNIA STATE PRISON-CORCORAN (COR)"
replace NAME = "CALIFORNIA STATE PRISON, SACRAMENTO (SAC)" if NAME == "CALIFORNIA STATE PRISON-SACRAMENTO (SAC)"
replace NAME = "CALIFORNIA STATE PRISON, SOLANO (SOL)" if NAME == "CALIFORNIA STATE PRISON-SOLANO (SOL)"
replace NAME = "CENTRAL CALIFORNIA WOMEN'S FACILITY (CCFW)" if NAME == "CENTRAL CALIFORNIA WOMEN'S FACILITY (CCWF)"
replace NAME = "PLEASANT VALLEY STATE PRISON(PVSP)" if NAME == "PLEASANT VALLEY STATE PRISON (PVSP)"
replace NAME = "R J DONOVAN CORRECTIONAL FACILITY (RJD)" if NAME == "RICHARD J. DONOVAN CORRECTIONAL FACILITY (RJD)"
replace NAME = "CA SUBSTANCE ABUSE TREATMENT FACILITY (SATF)" if NAME == "SUBSTANCE ABUSE TREATMENT FACILITY (SATF)"

merge m:1 NAME using "${raw_data}/Prison_Boundaries.dta", keepusing(NAME ANALYTICSAMPLE) keep(match) nogenerate

gen Stata_Date_local = date(date, "YMD")
format Stata_Date_local %td

gen month = month(Stata_Date_local)

* collpase data to month level 
sort NAME Stata_Date_local
drop if NAME == NAME[_n+1] & month == month[_n+1] // this keeps the last day of each month

gen year = year(Stata_Date_local)
keep if year == 2020

* generate new cases in month 
sort NAME
gen new_cases = totalconfirmed - totalconfirmed[_n-1] if NAME == NAME[_n-1]
replace new_cases = totalconfirmed if missing(new_cases) & month == 3 & year == 2020

* generate yearly variable 
foreach var of varlist totalconfirmed totaldeaths distinctpatientstested{
	gen `var'_2020_ = `var' if month == 12
	gegen `var'_2020 = max(`var'_2020_), by(NAME)
	drop `var'_2020_
}
 
keep NAME ANALYTICSAMPLE month totalconfirmed totaldeaths distinctpatientstested ///
	totalconfirmed_2020 totaldeaths_2020 distinctpatientstested_2020 new_cases
	
order NAME, before(totalconfirmed)

save "${intermediate_data}/CDCR_covid_19_for_merged_monthly.dta", replace

***************************************************************************************************
* 6b. Calculate covid cases from prison for each ZCTA
***************************************************************************************************

use "${intermediate_data}/CDCR_covid_19_for_merged_monthly.dta", clear

* merge to get prison ZCTA
merge m:1 NAME using "${intermediate_data}/cdcr_population_w_ShapefileID.dta", keep(master match) nogenerate ///
	keepusing(prison_ZCTA prison_zip)

* calcualate total covid cases in ZCTA from prison 
gegen ZCTA_cases_fr_prison = total(new_cases), by(prison_ZCTA month)

* collapse to ZCTA-month
sort prison_ZCTA month
drop if prison_ZCTA == prison_ZCTA[_n-1] & month == month[_n-1]
keep prison_ZCTA month ZCTA_cases_fr_prison

* merge to get ZCTA population 
rename prison_ZCTA ZCTA 
merge m:1 ZCTA using  "${intermediate_data}/ZCTA_pop_race_acs_17_21.dta", ///
	keepusing(tot_population_acs_17_21) keep(master match) nogenerate

* calculate per 100k cases 
gen ZCTA_cases_fr_pri_p100k = ZCTA_cases_fr_prison * 100000 / tot_population_acs_17_21

save "${intermediate_data}/CA_ZCTA_covid_from_prison_monthly.dta", replace

***************************************************************************************************
* 7. Generate a list of unique ZCTA that contains prison
***************************************************************************************************

use "${intermediate_data}/cdcr_population_w_ShapefileID.dta", clear 

* drop facilities that are not state prison in our analytic sample (there are 35 state prisons)
drop if missing(population_felons_201912)

sort prison_ZCTA 
drop if prison_ZCTA == prison_ZCTA[_n-1]

keep prison_ZCTA 
rename prison_ZCTA ZCTA 

gen byte has_state_prison = 1

save "${intermediate_data}/prison_ZCTA_list.dta", replace 
	
************************************************************************
* 8. compute employment for Correctional institutions by county 
* data source (QCEW-BLS):
* https://data.bls.gov/cew/apps/data_views/data_views.htm#tab=Tables
* https://www.bls.gov/cew/classifications/ownerships/ownership-titles.htm
* 92214	NAICS 92214 Correctional institutions
// https://www.bls.gov/cew/about-data/documentation-guide.htm
************************************************************************

import delimited "${raw_data}/92214_QCEW_BLS.csv", clear

keep if own_code == 2

* extract state
gen state_fips = substr(area_fips, 1, 2)
keep if state_fips == "06"
drop if area_fips == "06999" // unknown, undefined areas 
drop if area_fips == "06000" // California -- Statewide

destring state_fips area_fips, replace 

keep area_fips disclosure_code month3_emplvl qtrly_estabs

rename month3_emplvl emplvl_062020

save "${intermediate_data}/CA_92214_QCEW_BLS_062020.dta", replace

************************************************************************
** PART B. GENERATE ANALYSIS DATA
************************************************************************

// generate prison-ZCTA contact matrix using LODES data
run "${code}/process_lodes_data.do"

// load program read_prison_data
run "${code}/read_prison_data.do"

// generate San Quentin Event Study Analysis data
read_prison_data, prison("SAN QUENTIN STATE PRISON (SQ)")
save "${analysis_data}/San_Quentin_main_analysis_data.dta", replace

// generate Corcoran Event Study Analysis data
read_prison_data, prison("CALIFORNIA STATE PRISON, CORCORAN (COR)")
save "${analysis_data}/Corcoran_analysis_data.dta", replace

// generate San Quentin Event Study Analysis data (Drop one Ping)
read_prison_data, prison("SAN QUENTIN STATE PRISON (SQ)") contact_matrix_path("prison_time_from_Jun-OctHome_ZCTA_DropOnePinginJune.dta")
save "${analysis_data}/San_Quentin_drop_one_ping_analysis_data.dta", replace

// generate San Quentin Event Study Analysis data (Alternative Home Definition)
read_prison_data, prison("SAN QUENTIN STATE PRISON (SQ)") contact_matrix_path("prison_time_from_Jan-OctHome_ZCTA.dta")
save "${analysis_data}/San_Quentin_alternative_home_analysis_data.dta", replace


// Obtain a list of ZCTAs connected w/ ASP & CVSP (Used for Table S5, S6)
read_prison_data, prison("AVENAL STATE PRISON")	
keep if tag_ZCTA_contact & treated == 1 
keep ZCTA_contact hr_pri_fr_home6
gen treated_ASP = 1
rename hr_pri_fr_home6 hr_pri_fr_home6_ASP
save "${intermediate_data}/ASP_treated_ZCTA.dta", replace 

read_prison_data, prison("CHUCKAWALLA VALLEY STATE PRISON (CVSP)")
keep if tag_ZCTA_contact & treated == 1 
keep ZCTA_contact hr_pri_fr_home6
gen treated_CVSP = 1
rename hr_pri_fr_home6 hr_pri_fr_home6_CVSP
save "${intermediate_data}/CVSP_treated_ZCTA.dta", replace 



